# install packages
#install.packages("plyr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("ggmap")
# declare libraries
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)
# read in citibike data
citiBike <- read.csv("201511-citibike-tripdata.csv")
Identify patterns in the ride history data. Explain these patterns using appropriate visualization. Two potential patterns are (this is an illustrative list, you should formulate your own patterns as well to be explored here):
endStation<-as.data.frame(table(citiBike$end.station.id))
startStation<-as.data.frame(table(citiBike$start.station.id))
stationData <- merge(startStation, endStation, by="Var1", all=TRUE)
stationData[is.na(stationData)] <- 0
names(stationData) <- c("station.id", "start.freq", "end.freq")
stationData$inTraffic <- stationData$start.freq - stationData$end.freq
stationData$outTraffic <- stationData$end.freq - stationData$start.freq
The stations with the most outgoing traffic are:
outTraffic <- stationData[order(-stationData$outTraffic),]
barplot(outTraffic$outTraffic[outTraffic$outTraffic > 300], main="Bike Stations with Highest Net Outflow of Bikes", names.arg = outTraffic$station.id[outTraffic$outTraffic > 300], xlab = "Station IDs", ylab = "Net Outflow (Bikes)")
The stations with the most incoming traffic are:
inTraffic <- stationData[order(-stationData$inTraffic),]
barplot(inTraffic$inTraffic[inTraffic$inTraffic > 300], main="Bike Stations with Highest Net Inflow of Bikes", names.arg = inTraffic$station.id[inTraffic$inTraffic > 300], xlab = "Station IDs", ylab = "Net Inflow (Bikes)")
longestRide <- aggregate(citiBike[, 1], list(citiBike$start.station.id), mean)
names(longestRide) <- c("station.id", "mean.trip.duration")
stationData <- merge(stationData, longestRide, by="station.id", all=TRUE)
stationData[is.na(stationData)] <- 0
longestRide <- stationData[order(-stationData$mean.trip.duration),]
barplot(longestRide$mean.trip.duration[longestRide$mean.trip.duration > 2000], main = "Stations that Originate the Longest Rides", names.arg = longestRide$station.id[longestRide$mean.trip.duration > 2000], xlab = "Station IDs", ylab = "Ride Duration")
citiBike$time.slot = strtoi(format(as.POSIXct(citiBike$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
citiBike$time.slot.category[citiBike$time.slot < 8] = "earlyMorning"
citiBike$time.slot.category[citiBike$time.slot < 12 & citiBike$time.slot >= 8] <- "morning"
citiBike$time.slot.category[citiBike$time.slot < 16 & citiBike$time.slot >= 12] <- "afternoon"
citiBike$time.slot.category[citiBike$time.slot < 20 & citiBike$time.slot >= 16] <- "evening"
citiBike$time.slot.category[citiBike$time.slot >= 20] <- "lateEvening"
longest.ride.category <- aggregate(citiBike[, 1], list(citiBike$start.station.id, citiBike$time.slot.category), mean)
names(longest.ride.category) <- c("station.id", "time.slot", "mean")
longest.ride.category <- longest.ride.category[order(-longest.ride.category$mean),]
early.morning.ride = longest.ride.category[longest.ride.category$time.slot == "earlyMorning",]
barplot(early.morning.ride$mean[early.morning.ride$mean > 2100], names.arg = early.morning.ride$station.id[early.morning.ride$mean > 2100], main="Stations that Originate the Longest Rides in the Early Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")
morning.ride = longest.ride.category[longest.ride.category$time.slot == "morning",]
barplot(morning.ride$mean[morning.ride$mean > 2100], names.arg = morning.ride$station.id[morning.ride$mean > 2100], main="Stations that Originate the Longest Rides in the Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")
afternoon.ride = longest.ride.category[longest.ride.category$time.slot == "afternoon",]
barplot(afternoon.ride$mean[afternoon.ride$mean > 2500], names.arg = afternoon.ride$station.id[afternoon.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Afternoon", xlab = "Station IDs", ylab = "Mean Ride Duration")
evening.ride = longest.ride.category[longest.ride.category$time.slot == "evening",]
barplot(evening.ride$mean[evening.ride$mean > 2100], names.arg = evening.ride$station.id[evening.ride$mean > 2100], main="Stations that Originate the Longest Rides in the Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")
late.evening.ride = longest.ride.category[longest.ride.category$time.slot == "lateEvening",]
barplot(late.evening.ride$mean[late.evening.ride$mean > 2500], names.arg = late.evening.ride$station.id[late.evening.ride$mean > 2500], main = "Stations that Originate the Longest Rides in the Late Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")
# change
# ===============================================================================
# ===============================[calculations]==================================
# ===============================================================================
# =============================[most used routes]================================
# function returns a unique integer from two integer inputs
cantor_pairing_function <- function(a, b){
return(0.5 * (a + b) * (a + b + 1) + b)
}
# get all routes taken
routes <- subset(citiBike, select = c(start.station.id, end.station.id, start.station.latitude, start.station.longitude, end.station.latitude, end.station.longitude))
colnames(routes) <- c("start.id", "end.id", "start.lat", "start.lon", "end.lat", "end.lon")
# apply unique route.id based on the cantor pairing function, with start.id and end.id as inputs
routes$route.id = cantor_pairing_function(routes$start.id, routes$end.id)
# count # of times a route has been used
route_counts <- routes %>%
count(route.id)
colnames(route_counts) <- c("route.id", "route.count")
# get only unique routes
unique_routes <- unique(routes)
# merge route count with ids, lats, and longitudes
route_counts <- join(unique_routes, route_counts, by = "route.id", type = "full", match = "first")
route_counts <- subset(route_counts, select = -c(route.id) )
# sort by highest count
route_counts <- route_counts[with(route_counts, order(-route.count)), ]
# =============================[station surplus]================================
# get arrivals to station
arrival_counts <- citiBike %>%
count(end.station.id)
colnames(arrival_counts) <- c("id", "arr")
# get departures from station
departure_counts <- citiBike %>%
count(start.station.id)
colnames(departure_counts) <- c("id", "dep")
# get unique stations
stations <- unique(subset(citiBike, select = c(start.station.id, start.station.latitude, start.station.longitude)))
colnames(stations) <- c("id", "lat", "lon")
# merge arrivals and departures with list of stations
stations <- join(stations, arrival_counts, by = "id", type = "full", match = "first")
stations <- join(stations, departure_counts, by = "id", type = "full", match = "first")
# calculate surplus of station
stations$surplus = stations$arr - stations$dep
# seperate surplus and deficit stations
stations.surplus <- subset(stations, surplus >= 0)
stations.deficit <- subset(stations, surplus <= 0)
# ===============================================================================
# ==================================[mapping]====================================
# ===============================================================================
min_lat <- min(c(min(citiBike$start.station.latitude), min(citiBike$end.station.latitude))) + 0.025
max_lat <- max(c(max(citiBike$start.station.latitude), max(citiBike$end.station.latitude))) + 0.0055
min_lon <- min(c(min(citiBike$start.station.longitude), min(citiBike$end.station.longitude))) + 0.029
max_lon <- max(c(max(citiBike$start.station.longitude), max(citiBike$end.station.longitude))) + 0.005
nyc <- c(left = min_lon, bottom = min_lat, right = max_lon, top = max_lat)
basemap <- get_map(nyc, zoom = 13, maptype = "terrain", source = "stamen")
## Map from URL : http://tile.stamen.com/terrain/13/2411/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3081.jpg
# =============================[most used routes]================================
# declare variables
routes_to_show <- 10
origin <- c()
dest <- c()
# get routes
for(i in 1:routes_to_show){
origin <- append(origin, paste(routes$start.lat[i], routes$start.lon[i], sep=","))
dest <- append(dest, paste(routes$end.lat[i], routes$end.lon[i], sep=","))
}
# initialize routes map
map.routes <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))
# build routes map
for(i in 1:(routes_to_show)){
k <- (i - 1) / routes_to_show
map.routes <- map.routes +
geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74025878,-73.98409214&destination=40.71893904,-73.99266288&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74025878,-73.98409214&destination=40.71893904,-73.99266288&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74144387,-73.97536082&destination=40.74854862,-73.98808416&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72743423,-73.99379025&destination=40.72405549,-74.00965965&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.73454567,-73.99074142&destination=40.722103786686,-73.997249007225&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71910537,-73.99973337&destination=40.74394314,-73.97966069&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.749156,-73.9916&destination=40.74444921,-73.98303529&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.700469,-73.991454&destination=40.68926942,-73.98912867&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71542197,-74.01121978&destination=40.756014,-73.967416&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.750967348716,-73.994442075491&destination=40.73454567,-73.99074142&mode=driving&units=metric&alternatives=false&sensor=false
# display route map
map.routes
# =============================[station surplus]================================
# build surplus stations map
map.surplus <- ggmap(basemap, base_layer = ggplot(stations.surplus, aes(x = lon, y = lat))) +
geom_point(aes(color = surplus, size = surplus), alpha = .8) +
scale_color_gradient(low = "green", high = "darkgreen") +
scale_size(range = c(0.5, 10))
# build deficit stations map
map.deficit <- ggmap(basemap, base_layer = ggplot(stations.deficit, aes(x = lon, y = lat))) +
geom_point(aes(color = surplus, size = surplus), alpha = .8) +
scale_color_gradient(low = "darkred", high = "red") +
scale_size(range = c(30, 1))
# display surplus and deficit maps
map.surplus
map.deficit
# The stations that are start stations the most number of times are likely the best candidates for a higher bike storage capacity
startStationCount <- as.data.frame(table(citiBike$start.station.id))
startStationCount <- startStationCount[order(startStationCount$Freq, decreasing=TRUE),]
head(startStationCount)
## Var1 Freq
## 286 519 11267
## 81 293 8810
## 206 435 8268
## 265 497 7740
## 77 285 7175
## 16 151 6172
####Client is planning a promotion to increase bike rental by younger folks (age < 25) and tourists (i.e not an annual member). They want to focus on stations that are less used by these target groups. Can you help them identify these stations?
#Qu estion for group: should I combine these two groups into one demographic, or leave them separate?
#Some birth dates are kept empty, can't do anything in those situations.
head(citiBike)
## tripduration starttime stoptime start.station.id
## 1 1110 11/1/2015 00:00:00 11/1/2015 00:18:31 537
## 2 1094 11/1/2015 00:00:01 11/1/2015 00:18:15 537
## 3 520 11/1/2015 00:00:05 11/1/2015 00:08:45 536
## 4 753 11/1/2015 00:00:15 11/1/2015 00:12:48 229
## 5 353 11/1/2015 00:00:22 11/1/2015 00:06:15 285
## 6 1285 11/1/2015 00:00:22 11/1/2015 00:21:48 268
## start.station.name start.station.latitude start.station.longitude
## 1 Lexington Ave & E 24 St 40.74026 -73.98409
## 2 Lexington Ave & E 24 St 40.74026 -73.98409
## 3 1 Ave & E 30 St 40.74144 -73.97536
## 4 Great Jones St 40.72743 -73.99379
## 5 Broadway & E 14 St 40.73455 -73.99074
## 6 Howard St & Centre St 40.71911 -73.99973
## end.station.id end.station.name end.station.latitude
## 1 531 Forsyth St & Broome St 40.71894
## 2 531 Forsyth St & Broome St 40.71894
## 3 498 Broadway & W 32 St 40.74855
## 4 328 Watts St & Greenwich St 40.72406
## 5 151 Cleveland Pl & Spring St 40.72210
## 6 476 E 31 St & 3 Ave 40.74394
## end.station.longitude bikeid usertype birth.year gender time.slot
## 1 -73.99266 22545 Subscriber 1981 2 0
## 2 -73.99266 23959 Subscriber 1980 1 0
## 3 -73.98808 22251 Subscriber 1988 1 0
## 4 -74.00966 15869 Subscriber 1981 1 0
## 5 -73.99725 21645 Subscriber 1987 1 0
## 6 -73.97966 14788 Customer NA 0 0
## time.slot.category
## 1 earlyMorning
## 2 earlyMorning
## 3 earlyMorning
## 4 earlyMorning
## 5 earlyMorning
## 6 earlyMorning
young.customers <- subset(citiBike, citiBike$birth.year > 1991)
young.startStation <- as.data.frame(table(young.customers$start.station.id))
head(young.startStation)
## Var1 Freq
## 1 72 65
## 2 79 106
## 3 82 21
## 4 83 35
## 5 116 172
## 6 119 8
sorted.young <- young.startStation[order(young.startStation$Freq),]
head(sorted.young)
## Var1 Freq
## 330 3052 1
## 401 3127 1
## 448 3221 1
## 192 418 2
## 325 3044 2
## 366 3089 2
#It may be useful to classify stations "less used" as those with a certain frequency. This would mean taking a subset of sorted.young.
#User type is either subscriber or customer, so I'm assuming tourists are customers
tourist.customers <- subset(citiBike,citiBike$usertype == "Customer")
tourist.startStation <- as.data.frame(table(tourist.customers$start.station.id))
head(tourist.startStation)
## Var1 Freq
## 1 72 261
## 2 79 264
## 3 82 155
## 4 83 139
## 5 116 187
## 6 119 23
sorted.tourist <- tourist.startStation[order(tourist.startStation$Freq),]
head(sorted.tourist)
## Var1 Freq
## 312 2005 1
## 334 3054 2
## 30 218 5
## 339 3059 5
## 330 3049 8
## 453 3221 8